Etude de marché
Importation des csv
# Load the 2017 source tables (semicolon-separated CSV files, strings declared
# as UTF-8). The data directory is declared once so the script can be pointed
# at another location by editing a single line instead of every read.csv().
data_dir <- "C:/Users/Nourredine/Desktop/Fichiers/OpenClassRooms/Livrables/P9_bahloul_nourredine/DAN-P9-data"

# Read one CSV file of `data_dir` with the options shared by all source files.
read_p9_csv <- function(file_name) {
  read.csv(file.path(data_dir, file_name), encoding = "UTF-8", sep = ";", header = TRUE)
}

dispo_al <- read_p9_csv("DisponibiliteAlimentaire_2017.csv")
pib <- read_p9_csv("Pib_2017_bis.csv")
pop <- read_p9_csv("Population_2017.csv")
Dispo_cal <- read_p9_csv("Dispo_calorique_2017.csv")
# Same file as the former, commented-out `stabilite` import.
Sta_pol <- read_p9_csv("StabilitePolitique_2017.csv")
View(Pib_2017_bis)
Importation des librairies
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
#library(tidyverse)
#library(tibble)
Première jointure entre PIB & Population
# Left join on the country key: every row of `pib` is kept and receives the
# matching population columns when they exist.
merge1 <- merge(x = pib, y = pop, by = "X.U.FEFF.Pays", all.x = TRUE)
View(merge1)
Seconde jointure avec la table Dispo_alimentaire
# Left join of the food-availability table onto the PIB/population table.
merge2 <- merge(x = merge1, y = dispo_al, by = "X.U.FEFF.Pays", all.x = TRUE)
View(merge2)
Jointure // merge2 et Dispo_cal
# Left join of the calorie table; both sides use the same key column name.
merge2 <- merge(x = merge2, y = Dispo_cal, by = "X.U.FEFF.Pays", all.x = TRUE)
Jointure // merge, sta_pol
# Left join of the political-stability table; its key column is named
# differently, hence the separate by.x / by.y.
merge2 <- merge(
  x = merge2,
  y = Sta_pol,
  by.x = "X.U.FEFF.Pays",
  by.y = "X.U.FEFF.Country",
  all.x = TRUE
)
Renommage variables
# Positional rename of the 15 columns of the merged table.
names(merge2) <- c(
  "Pays",
  "Pib_dollars_million",
  "pib_habitant_dollars",
  "Population2010",
  "Population2017",
  "Evolution_population",
  "Annee",
  "Production_tonnes",
  "Importations_tonnes",
  "Exportations_tonnes",
  "Dispo_interieure_tonnes",
  "Tx_dependance",
  "Balance_com",
  "Cal_personne_jour",
  "Stabilite_politique"
)
Vérification types de données
sapply(merge2, class)
## Pays Pib_dollars_million pib_habitant_dollars
## "factor" "factor" "factor"
## Population2010 Population2017 Evolution_population
## "integer" "integer" "factor"
## Annee Production_tonnes Importations_tonnes
## "integer" "integer" "integer"
## Exportations_tonnes Dispo_interieure_tonnes Tx_dependance
## "integer" "integer" "factor"
## Balance_com Cal_personne_jour Stabilite_politique
## "integer" "integer" "numeric"
Certaines variables n’ont pas le bon format : conversion de Factor en Numeric
# These four columns were read as factors (see the class listing above): swap
# the decimal comma for a dot, then parse the text as numbers.
decimal_comma_cols <- c(
  "Pib_dollars_million",
  "pib_habitant_dollars",
  "Evolution_population",
  "Tx_dependance"
)
merge2[decimal_comma_cols] <- lapply(
  merge2[decimal_comma_cols],
  function(column) as.numeric(sub(",", ".", column))
)
sapply(merge2, class)
## Pays Pib_dollars_million pib_habitant_dollars
## "factor" "numeric" "numeric"
## Population2010 Population2017 Evolution_population
## "integer" "integer" "numeric"
## Annee Production_tonnes Importations_tonnes
## "integer" "integer" "integer"
## Exportations_tonnes Dispo_interieure_tonnes Tx_dependance
## "integer" "integer" "numeric"
## Balance_com Cal_personne_jour Stabilite_politique
## "integer" "integer" "numeric"
Remplacement NA par 0
# Replace missing values by 0 in the numeric columns only. The factor column
# `Pays` is left untouched, so no invalid-factor-level warning is raised and
# nothing has to be hidden behind suppressWarnings().
# NOTE(review): this also writes 0 into columns such as Annee and the
# population counts (the summary further down shows a minimum of 0 for them),
# which weighs on the correlations and the clustering - confirm that zero
# imputation is really intended for every variable.
numeric_cols <- vapply(merge2, is.numeric, logical(1))
merge2[numeric_cols] <- lapply(
  merge2[numeric_cols],
  function(column) replace(column, is.na(column), 0)
)
**Reste-t-il des valeurs manquantes (NA) ?**
any(is.na(merge2))
## [1] FALSE
Redisposition des colonnes
# Reorder the columns: identifiers first, then demography, wealth, and the
# production / trade figures.
merge2 <- merge2[c(
  "Pays",
  "Annee",
  "Population2010",
  "Population2017",
  "Evolution_population",
  "Pib_dollars_million",
  "pib_habitant_dollars",
  "Cal_personne_jour",
  "Production_tonnes",
  "Dispo_interieure_tonnes",
  "Importations_tonnes",
  "Exportations_tonnes",
  "Tx_dependance",
  "Balance_com",
  "Stabilite_politique"
)]
Dimensions dataset et types de données
str(merge2)
## 'data.frame': 208 obs. of 15 variables:
## $ Pays : Factor w/ 208 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Annee : num 2017 2017 2017 2017 2017 ...
## $ Population2010 : num 29185507 51216964 2948023 35977455 80827002 ...
## $ Population2017 : num 36296113 57009756 2884169 41389189 82658409 ...
## $ Evolution_population : num 0.24 0.11 -0.02 0.15 0.02 -0.09 0.28 0.09 0.08 0.21 ...
## $ Pib_dollars_million : num 1.86e+07 3.49e+08 1.30e+07 1.70e+08 3.68e+09 ...
## $ pib_habitant_dollars : num 513 6122 4514 4110 44552 ...
## $ Cal_personne_jour : num 6 146 55 22 38 0 34 0 233 130 ...
## $ Production_tonnes : num 28000 1667000 13000 275000 1514000 ...
## $ Dispo_interieure_tonnes: num 57000 2118000 47000 277000 1739000 ...
## $ Importations_tonnes : num 29000 514000 38000 2000 842000 0 277000 0 7000 722000 ...
## $ Exportations_tonnes : num 0 63000 0 0 646000 0 0 0 0 10000 ...
## $ Tx_dependance : num 0.51 0.24 0.81 0.01 0.48 0 0.87 0 1 0.5 ...
## $ Balance_com : num 0 -451000 -38000 -2000 -196000 0 -277000 0 -7000 -712000 ...
## $ Stabilite_politique : num -2.8 0 0 0 0 0 -0.33 0 0 0 ...
Distribution des données
summary(merge2)
## Pays Annee Population2010
## Afghanistan : 1 Min. : 0 Min. :0.000e+00
## Afrique du Sud: 1 1st Qu.:2017 1st Qu.:8.549e+05
## Albanie : 1 Median :2017 Median :6.191e+06
## Algérie : 1 Mean :1619 Mean :3.307e+07
## Allemagne : 1 3rd Qu.:2017 3rd Qu.:2.120e+07
## Andorre : 1 Max. :2017 Max. :1.369e+09
## (Other) :202
## Population2017 Evolution_population Pib_dollars_million
## Min. :0.000e+00 Min. :-0.20000 Min. :4.391e+04
## 1st Qu.:1.166e+06 1st Qu.: 0.03000 1st Qu.:5.324e+06
## Median :6.910e+06 Median : 0.08000 Median :2.381e+07
## Mean :3.612e+07 Mean : 0.09774 Mean :3.849e+08
## 3rd Qu.:2.480e+07 3rd Qu.: 0.16250 3rd Qu.:1.752e+08
## Max. :1.421e+09 Max. : 0.53000 Max. :1.954e+10
##
## pib_habitant_dollars Cal_personne_jour Production_tonnes
## Min. : 107 Min. : 0.0 Min. : 0
## 1st Qu.: 2025 1st Qu.: 7.0 1st Qu.: 1000
## Median : 6169 Median : 40.5 Median : 37500
## Mean : 16429 Mean : 55.3 Mean : 582466
## 3rd Qu.: 19551 3rd Qu.: 86.0 3rd Qu.: 203250
## Max. :171278 Max. :239.0 Max. :21914000
##
## Dispo_interieure_tonnes Importations_tonnes Exportations_tonnes
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 7000 1st Qu.: 0 1st Qu.: 0
## Median : 54500 Median : 9000 Median : 0
## Mean : 556764 Mean : 67928 Mean : 82567
## 3rd Qu.: 253750 3rd Qu.: 51000 3rd Qu.: 6000
## Max. :18266000 Max. :1069000 Max. :4223000
##
## Tx_dependance Balance_com Stabilite_politique
## Min. :0.0000 Min. :-1059000 Min. :-2.80000
## 1st Qu.:0.0000 1st Qu.: -17000 1st Qu.: 0.00000
## Median :0.1300 Median : 0 Median : 0.00000
## Mean :0.3517 Mean : 18111 Mean :-0.05774
## 3rd Qu.:0.6975 3rd Qu.: 0 3rd Qu.: 0.00000
## Max. :2.2200 Max. : 4220000 Max. : 1.33000
##
Corrélations
# Pairwise correlations between every column except the first one (`Pays`).
res <- cor(merge2[, -1])
#res
library(corrplot)
## corrplot 0.92 loaded
# Upper triangle only, variables ordered by hierarchical clustering.
corrplot(
  res,
  order = "hclust",
  type = "upper",
  tl.srt = 45,
  tl.col = "black"
)
Sélection de quelques catégories pour comparer
library(tidyverse)
## Registered S3 method overwritten by 'rvest':
## method from
## read_xml.response xml2
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v tibble 3.1.1 v purrr 0.3.4
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'readr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Long-format table: one row per (country, category) pair, the tonnage being
# stored in the default `value` column of pivot_longer().
df2 <- merge2[, c("Pays", "Dispo_interieure_tonnes", "Production_tonnes", "Importations_tonnes", "Exportations_tonnes")] %>%
  pivot_longer(
    c("Dispo_interieure_tonnes", "Production_tonnes", "Importations_tonnes", "Exportations_tonnes"),
    names_to = "Categorie"
  )
view(df2)
# One boxplot per category, with the individual countries jittered on top.
# The title typo ("dans la monde") is corrected.
ggplot(data = df2, aes(x = Categorie, y = value, color = Categorie)) +
  geom_boxplot() +
  scale_color_brewer(palette = "Dark2") +
  geom_jitter(shape = 16, position = position_jitter(0.2)) +
  labs(
    title = "Marché de la volaille dans le monde",
    y = "en tonnes",
    x = "catégorie"
  )
Etiqueter les outliers
# Flag the outliers of a numeric vector with the 1.5 * IQR rule.
#
# x: numeric vector; may contain NA.
# Returns a logical vector of the same length as x: TRUE where the value lies
# below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR. Missing inputs yield NA
# instead of making quantile() stop with an error.
is_outlier <- function(x) {
  # Both quartiles are computed once; their difference is the IQR.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence <- 1.5 * (quartiles[2] - quartiles[1])
  x < quartiles[1] - fence | x > quartiles[2] + fence
}
str(df2)
## tibble [832 x 3] (S3: tbl_df/tbl/data.frame)
## $ Pays : Factor w/ 208 levels "Afghanistan",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ Categorie: chr [1:832] "Dispo_interieure_tonnes" "Production_tonnes" "Importations_tonnes" "Exportations_tonnes" ...
## $ value : num [1:832] 57000 28000 29000 0 2118000 ...
Créer une colonne ‘outlier’
# Tag outliers with the country name, NA otherwise.
# The 1.5 * IQR rule is applied within each category: the boxplots are drawn
# per category, so pooling the four series (as was done before) flagged
# countries against a distribution that no box represents.
df2$Pays <- as.character(df2$Pays)
df7 <- df2 %>%
  group_by(Categorie) %>%
  # NA_character_ keeps the column character even in a group with no outlier.
  mutate(is_outlier = ifelse(is_outlier(value), Pays, NA_character_)) %>%
  ungroup()
View(df7)
# Blank the country name of non-outliers so that only outliers get a label.
df7$Pays[is.na(df7$is_outlier)] <- NA_character_
View(df7)
Visualisation des outliers
# Boxplot per category with outliers drawn in red; rows whose `Pays` is not NA
# get a text label next to their point.
ggplot(df7, aes(x = Categorie, y = value, fill = Categorie)) +
  geom_boxplot(
    width = 0.6,
    alpha = 0.7,
    outlier.size = 3,
    outlier.shape = 19,
    outlier.colour = "red"
  ) +
  geom_text(aes(label = Pays), hjust = -0.2, na.rm = TRUE) +
  theme_grey() +
  labs(
    title = "Répartition des postes avec outliers",
    subtitle = "Postes principaux d'échanges",
    caption = "Trop de pays ressortent en tant qu'outliers",
    x = "",
    y = "En tonnes"
  ) +
  theme(
    aspect.ratio = 1.4,
    legend.position = "right",
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10),
    axis.text = element_text(size = 10),
    plot.title = element_text(size = 15, face = "bold"),
    plot.subtitle = element_text(size = 10),
    plot.caption = element_text(color = "Red", face = "italic", size = 13)
  )
Suppression de colonnes
# Drop the listed columns; every other column is kept for the clustering.
merge2 <- merge2[, !(names(merge2) %in% c(
  "Annee",
  "Population2010",
  "Population2017",
  "Production_tonnes",
  "Dispo_interieure_tonnes",
  "Importations_tonnes",
  "Pib_dollars_million",
  "Cal_personne_jour",
  "Stabilite_politique",
  "Exportations_tonnes"
)), drop = FALSE]
Mise de ‘Pays’ en index (il ne faut aucune variable non numérique pour le clustering)
library(tibble)
# Move the country names to the row names so that only numeric columns remain.
merge2 <- column_to_rownames(merge2, var = "Pays")
Suppression lignes
# Keep only the rows whose name is not one of the three excluded countries.
merge2 <- merge2[
  is.na(match(
    row.names(merge2),
    c("Brésil", "États-Unis d'Amérique", "Chine, continentale")
  )),
]
Maintenant on réduit les données
# Centre and scale every column, then go back to a data frame.
merge2_sc <- merge2 %>%
  scale() %>%
  as.data.frame()
summary(merge2_sc)
## Evolution_population pib_habitant_dollars Tx_dependance
## Min. :-2.8559 Min. :-0.6435 Min. :-0.7772
## 1st Qu.:-0.6549 1st Qu.:-0.5672 1st Qu.:-0.7772
## Median :-0.1765 Median :-0.4042 Median :-0.4722
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6848 3rd Qu.: 0.1274 3rd Qu.: 0.7917
## Max. : 4.1298 Max. : 6.1643 Max. : 4.0602
## Balance_com
## Min. :-5.90705
## 1st Qu.: 0.01834
## Median : 0.11501
## Mean : 0.00000
## 3rd Qu.: 0.11501
## Max. : 5.63097
détermination du type de distance ; ici: euclidienne
# Euclidean distances between the scaled observations.
dist_mat <- dist(x = merge2_sc, method = "euclidean")
Création du dendrogramme par partition hiérarchique
# Agglomerative clustering with average linkage.
hclust_avg <- hclust(d = dist_mat, method = "average")
#plot(hclust_avg)
Coupage du dendrogramme pour ne garder que les principaux clusters
# Cut the tree into 4 groups; the result gives one cluster id per observation.
cut_avg <- cutree(tree = hclust_avg, k = 4)
Colorisation des clusters
suppressPackageStartupMessages(library(dendextend))
avg_dend_obj <- as.dendrogram(hclust_avg)
# Colour the branches by the same 4 clusters as cutree(hclust_avg, k = 4).
# `h = 4` cut the tree at height 4 instead, which does not have to produce
# four groups, so the colours could disagree with `cut_avg`.
avg_col_dend <- color_branches(avg_dend_obj, k = 4)
plot(avg_col_dend)
Combien d’informations du dataframe sont attribuées à chaque cluster ?
suppressPackageStartupMessages(library(dplyr))
# Attach the hierarchical cluster id to each row, then count rows per cluster.
merge2_cl <- merge2 %>%
  mutate(cluster = cut_avg)
merge2_cl %>%
  count(cluster)
## cluster n
## 1 1 195
## 2 2 4
## 3 3 2
## 4 4 4
#install.packages("readxl")
#install.packages("FactoMineR")
#install.packages("factoextra")
Importations librairies nécessaires
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
Les données ont déjà été mises à l’échelle avec le dendrogramme
Trouver le nombre optimal de clusters
# Within-cluster sum of squares against the number of clusters (elbow plot).
fviz_nbclust(x = merge2_sc, FUNcluster = kmeans, method = "wss")
Réalisation du K-means grâce au nombre optimal K
# Fix the RNG seed so the random starts of k-means are reproducible.
set.seed(1)
# k-means with 4 clusters on the scaled data; 25 random starts.
km <- kmeans(x = merge2_sc, centers = 4, nstart = 25)
Liste des clusters par individu
#view results
#km
Déploiement du K-means
# Plot the observations of the final k-means model, coloured by cluster.
fviz_cluster(object = km, data = merge2_sc)
Centroïdes
# Mean of every (unscaled) variable within each k-means cluster.
clusters <- aggregate(
  x = merge2,
  by = list(cluster = km$cluster),
  FUN = mean
)
clusters
## cluster Evolution_population pib_habitant_dollars Tx_dependance
## 1 1 0.05809524 72997.93 0.2766667
## 2 2 0.04200000 11874.22 0.1053333
## 3 3 0.22705882 5418.09 0.2403922
## 4 4 0.08372093 10712.01 1.0597674
## Balance_com
## 1 -86952.38
## 2 22888.89
## 3 -90098.04
## 4 5000.00
Les pays du cluster 4 disposent beaucoup moins qu’ils ne produisent : ils exportent l’essentiel de leur production. Les pays du cluster 3 disposent presque autant qu’ils produisent : ils sont auto-suffisants. Les pays du cluster 2 disposent plus qu’ils ne produisent : ils ont une réelle dépendance / demande.
# Append the k-means cluster id as a new `cluster` column of the original
# (unscaled) data.
final_data <- data.frame(merge2, cluster = km$cluster, check.names = FALSE)
# Open the labelled table in the viewer.
View(final_data)
Suppression de la variable “cluster” dans le nouveau df “clusters”
# Remove the first column (the cluster id) so only the centroid values remain.
clusters <- clusters %>%
  select(-1)
1ère étape : quelle est la part de chaque composante principale sur le plan factoriel ?
# PCA on the centred, unit-variance variables; summary() lists the share of
# variance carried by each component.
merge2.pca <- prcomp(x = merge2, center = TRUE, scale. = TRUE)
summary(merge2.pca)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.1163 0.9965 0.9856 0.8884
## Proportion of Variance 0.3115 0.2483 0.2429 0.1973
## Cumulative Proportion 0.3115 0.5598 0.8027 1.0000
Description de l’ACP
str(merge2.pca)
## List of 5
## $ sdev : num [1:4] 1.116 0.997 0.986 0.888
## $ rotation: num [1:4, 1:4] -0.6754 0.5343 -0.3509 0.3678 0.0177 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "Evolution_population" "pib_habitant_dollars" "Tx_dependance" "Balance_com"
## .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
## $ center : Named num [1:4] 9.84e-02 1.63e+04 3.57e-01 -2.02e+04
## ..- attr(*, "names")= chr [1:4] "Evolution_population" "pib_habitant_dollars" "Tx_dependance" "Balance_com"
## $ scale : Named num [1:4] 1.04e-01 2.51e+04 4.59e-01 1.76e+05
## ..- attr(*, "names")= chr [1:4] "Evolution_population" "pib_habitant_dollars" "Tx_dependance" "Balance_com"
## $ x : num [1:205, 1:4] -1.325 -1.102 0.132 -0.289 0.646 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:205] "Afghanistan" "Afrique du Sud" "Albanie" "Algérie" ...
## .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
## - attr(*, "class")= chr "prcomp"
Installation librairies nécessaires
#install.packages("Rtools", force = TRUE)
library(devtools)
## Warning: package 'devtools' was built under R version 3.6.3
## Loading required package: usethis
# Install ggbiplot from GitHub only when it is missing, instead of contacting
# GitHub (and possibly rebuilding the package) on every run of the analysis.
if (!requireNamespace("ggbiplot", quietly = TRUE)) {
  install_github("vqv/ggbiplot")
}
## WARNING: Rtools is required to build R packages, but is not currently installed.
##
## Please download and install Rtools 3.5 from https://cran.r-project.org/bin/windows/Rtools/.
## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
## Use `force = TRUE` to force installation
Mise en forme de l’ACP (représentation par individu)
# NOTE(review): plyr is attached after dplyr here. As the start-up message
# below reports, plyr's arrange, count, desc, mutate, rename and summarise now
# mask the dplyr versions for the rest of the script - confirm this is intended
# or attach plyr before dplyr.
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
##
## compact
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(ggbiplot)
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Loading required package: grid
ggbiplot(merge2.pca)
Avec des labels pour chaque pays
ggbiplot(merge2.pca, labels=rownames(merge2))
Pas très parlant. Mieux vaut à la place mettre nos clusters
Repassons par la 1ère étape
# Same PCA, this time on the four cluster centroids instead of the countries.
clusters.pca <- prcomp(x = clusters, center = TRUE, scale. = TRUE)
summary(clusters.pca)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.3242 1.2037 0.8930 7.359e-17
## Proportion of Variance 0.4384 0.3622 0.1994 0.000e+00
## Cumulative Proportion 0.4384 0.8006 1.0000 1.000e+00
Diagramme plus clair avec une meilleure représentativité
ggbiplot(clusters.pca)
Idem avec un cercle pour s’assurer que chaque vecteur est bien représenté
# Biplot of the centroids on the default pair of components, with the
# correlation circle and one label per centroid.
ggbiplot(
  clusters.pca,
  labels = rownames(clusters),
  obs.scale = 1,
  var.scale = 1,
  circle = TRUE
) +
  ggtitle("ACP marché mondial") +
  theme_minimal() +
  theme(legend.position = "right")
Eventuellement faire apparaître un deuxième plan factoriel avec la 3ème composante principale pour distinguer les variables dont l’inertie n’était pas forte dans PC1 et PC2
# Same biplot on components 2 and 3.
ggbiplot(
  clusters.pca,
  choices = c(2, 3),
  labels = rownames(clusters),
  obs.scale = 1,
  var.scale = 1,
  circle = TRUE
) +
  ggtitle("ACP marché mondial") +
  theme_minimal() +
  theme(legend.position = "right")
Corrélations entre nos principaux clusters et les différentes variables
# Distribution of each variable within the four k-means clusters. The palette
# is declared once so the three plots stay in sync, and the y-axis label typo
# ("hanitant") is corrected.
cluster_palette <- c("lightblue", "greenyellow", "mediumpurple1", "peachpuff2")
boxplot(merge2$pib_habitant_dollars ~ km$cluster, ylab = "Pib par habitant en dollars", xlab = "Clusters", col = cluster_palette)
boxplot(merge2$Tx_dependance ~ km$cluster, ylab = "Taux de dépendance", xlab = "Clusters", col = cluster_palette)
boxplot(merge2$Evolution_population ~ km$cluster, ylab = "Evolution population", xlab = "Clusters", col = cluster_palette)
# One data frame per k-means cluster. Base subsetting is used because it always
# keeps the country row names, which the cluster bar charts further down read
# through row.names(); dplyr::filter() has not preserved data-frame row names
# in every release.
cluster_1 <- final_data[final_data$cluster == 1, ]
View(cluster_1)
cluster_2 <- final_data[final_data$cluster == 2, ]
View(cluster_2)
cluster_3 <- final_data[final_data$cluster == 3, ]
View(cluster_3)
cluster_4 <- final_data[final_data$cluster == 4, ]
View(cluster_4)
# Scale the centroid table before drawing the heatmap.
clusters_sc <- as.data.frame(scale(clusters))
library(dendextend)
# Branch colours shared by both dendrograms.
mycols <- c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07")
# Dendrogram of the rows (centroids).
row_dend <- as.dendrogram(hclust(dist(clusters_sc)))
row_dend <- set(row_dend, "branches_lwd", 1)
row_dend <- set(row_dend, "branches_k_color", mycols[1:4], k = 4)
# Dendrogram of the columns (variables): same steps on the transposed table.
col_dend <- as.dendrogram(hclust(dist(t(clusters_sc))))
col_dend <- set(col_dend, "branches_lwd", 1)
col_dend <- set(col_dend, "branches_k_color", mycols[1:4], k = 4)
library(heatmaply)
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: viridis
## Loading required package: viridisLite
## Warning: package 'viridisLite' was built under R version 3.6.3
##
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
##
## viridis_pal
##
## ======================
## Welcome to heatmaply version 1.3.0
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags:
## https://stackoverflow.com/questions/tagged/heatmaply
## ======================
# Visualize the heatmap
# Interactive heatmap of the scaled centroids, ordered by the two dendrograms
# built above.
# Options left disabled: seriate = "none", row_dend_right = TRUE,
# plot_method = "plotly".
heatmaply(clusters_sc, Rowv = row_dend, Colv = col_dend)
On “redésindexe” la colonne “Pays”
# Turn the row names back into a regular `Pays` column.
final_data <- rownames_to_column(final_data, var = "Pays")
On affiche les pays sélectionnés dans un df
# Rows of the six selected countries, sorted by decreasing population growth.
filter_country <- final_data %>%
  filter(Pays %in% c(
    "Qatar",
    "Oman",
    "Koweït",
    "Émirats arabes unis",
    "Bahreïn",
    "Arabie saoudite"
  )) %>%
  arrange(desc(Evolution_population))
View(filter_country)
**Comme on ne s’intéresse pas qu’à une seule variable, on observe les clusters a priori les moins intéressants avec les variables qui nous intéressent**
Évolution de la population et PIB/habitant au sein du cluster 3
# Horizontal bar chart of population growth for the countries of cluster 3,
# sorted by value; the blue line marks the cluster mean.
ggplot(
  cluster_3,
  aes(
    x = reorder(row.names(cluster_3), Evolution_population),
    y = Evolution_population
  )
) +
  geom_col(aes(fill = Evolution_population)) +
  coord_flip() +
  theme_grey() +
  scale_fill_gradient(name = "") +
  labs(
    title = "Rang Evolution_population du cluster 3",
    x = "Pays",
    y = "Taux d'évolution"
  ) +
  geom_hline(
    yintercept = mean(cluster_3$Evolution_population),
    color = "blue",
    size = 1
  )
# Horizontal bar chart of GDP per capita for the countries of cluster 3,
# sorted by value; the blue line marks the cluster mean.
ggplot(
  cluster_3,
  aes(
    x = reorder(row.names(cluster_3), pib_habitant_dollars),
    y = pib_habitant_dollars
  )
) +
  geom_col(aes(fill = pib_habitant_dollars)) +
  coord_flip() +
  theme_grey() +
  scale_fill_gradient(name = "") +
  labs(
    title = "Rang pays Pib / habitant du cluster 3",
    x = "Pays",
    y = "Pib / habitant"
  ) +
  geom_hline(
    yintercept = mean(cluster_3$pib_habitant_dollars),
    color = "blue",
    size = 1
  )
Sources utilisées:
Analyse exploratoire : https://towardsdatascience.com/exploratory-data-analysis-in-r-for-beginners-fe031add7072 Dendrogramme : https://www.datacamp.com/community/tutorials/hierarchical-clustering-R K-means : https://www.statology.org/k-means-clustering-in-r/ ACP : https://www.datacamp.com/community/tutorials/pca-analysis-r Heatmap : https://www.datanovia.com/en/blog/how-to-create-a-beautiful-interactive-heatmap-in-r/